
//-----------------------------------------------------------------------
//
// Gentrification and pioneer businesses
// Kristian Behrens, Brahim Boualam, Julien Martin, and Florian Mayneris
//
// First version: 17/02/2015
// This  version: 17/11/2021
//
// Preparation of the NHGIS census variables for New York
//
//-----------------------------------------------------------------------

// Do-file for concording geographical data, and for apportioning data
// using Census Blocks (either with population weights or proportional to the
// surface of the blocks)


// THIS IS FOR THE EDUCATION VARIABLES, BROKEN DOWN FROM BLOCK GROUP TO BLOCK


cd "/Users/kristianbehrens/Desktop/GENTRI_RESTAT_FINAL/"


clear
clear matrix

set more off


// This is the list of counties for the NY MSA (we use the 2010 definition)
#delimit ; 

local listcountyNYMSA
34003	36005	36027	34013	34017	34019	36047	34023	34025	34027
36059	36061	34029	36071	34031	42103	36079	36081	36085	36087
34035	36103	34037	34039	36119;

local listvar
001 002 003 004 005;

local years
90 00 10;

#delimit cr 



// -----------------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 1990 BLOCKGROUP DATA

// STEP 1

insheet using "data/census/nhgis_edu_bgroup/nhgis0011_ds123_1990_blck_grp.csv", delimit(",")

gen temp1 = ""
gen temp2 = ""
gen temp3 = ""
gen temp4 = ""
gen geoid90 = ""

gen test = substr(gisjoin, 2, 100)

if (strlen(gisjoin) == 13) {
	replace temp1 = substr(test, 1, 2)
	replace temp2 = substr(test, 4, 3)
	replace temp3 = substr(test, 8, 4)
	replace temp4 = substr(test, 12, 1)
	replace geoid90 = temp1 + temp2 + temp3 + temp4
}

if (strlen(gisjoin) == 15) {
	replace temp1 = substr(test, 1, 2)
	replace temp2 = substr(test, 4, 3)
	replace temp3 = substr(test, 8, 6)
	replace temp4 = substr(test, 14, 1)
	replace geoid90 = temp1 + temp2 + temp3 + temp4
}

drop temp* test

sort geoid90
drop if geoid90[_n-1] == geoid90[_n]


keep geoid90 e33004 e33005 e33006 e33007
gen edu_college90 = e33004 + e33005 + e33006 + e33007

drop e33004 e33005 e33006 e33007

//Keep only NYMSA counties
gen ctid = substr(geoid90, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid90 edu_college90
drop ctid temp
save "temp/temp_variables.dta", replace


//STEP 2

//Merge with the 1990 geography files
use "data/geography/geog_new_york/geography1990.dta", clear


// Required for 1990 data, takes care of the block suffix
gen full_bg_id = ""
replace full_bg_id = substr(geoid90, 1, strlen(geoid90) - 2) if blocksuffix90 == ""
replace full_bg_id = substr(geoid90, 1, strlen(geoid90) - 3) if blocksuffix90 != ""

gen ctid = substr(geoid90, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid90 geoid90_back
ren full_bg_id geoid90

sort geoid90

merge m:1 geoid90 using "temp/temp_variables.dta"

tab _merge
drop if _merge == 2
drop _merge

ren geoid90 geoid90_blgroup
ren geoid90_back geoid90

egen pop_all90 = sum(pop90), by(geoid90_blgroup)
egen blgroup_surface = sum(aland90), by(geoid90_blgroup)


// STEP 3: BREAKING THE DATA DOWN BY POPULATION SHARE

gen pop_share = 0
replace pop_share = pop90/pop_all90 if pop_all90 != 0

gen edu_college90_block = edu_college90*pop_share

drop pop_share

gen year = 1990
keep year geoid90 edu_college90_block

ren geoid90 geoid
ren edu_college90_block edu_college_block

order geoid year edu_college_block

save "temp/edu1990.dta", replace

// END: MATCHING OF NHGIS 1990 BLOCKGROUP DATA
// -----------------------------------------------------------------


clear


// -----------------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 2000 BLOCKGROUP DATA


// STEP 1: GET THE BLOCKGROUP DATA

insheet using "data/census/nhgis_edu_bgroup/nhgis0011_ds152_2000_blck_grp.csv", delimit(",")

gen test = substr(gisjoin, 2, 100)
gen temp1 = substr(test, 1, 2)
gen temp2 = substr(test, 4, 3)
gen temp3 = substr(test, 8, 6)
gen temp4 = substr(test, 14, 1)
gen geoid00 = temp1 + temp2 + temp3 + temp4


drop test temp*
keep geoid00 hd1010-hd1016 hd1026-hd1032
gen edu_college00 = hd1010 + hd1011 + hd1012 + hd1013 + hd1014 + hd1015 + hd1016 + hd1026 + hd1027 + hd1028 + hd1029 + hd1030 + hd1031 + hd1032

drop hd1010-hd1016 hd1026-hd1032

//Keep only NYMSA counties
gen ctid = substr(geoid00, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid00 edu_college00
drop ctid temp
save "temp/temp_variables.dta", replace


//STEP 2: MERGE WITH THE BLOCK GEOGRAPHY FILES

use "data/geography/geog_new_york/geography2000.dta", clear

gen ctid = substr(geoid00, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid00 geoid00_back
gen geoid00 = substr(geoid00_back, 1, 12)

sort geoid00

merge m:1 geoid00 using "temp/temp_variables.dta"

tab _merge
drop _merge

ren geoid00 geoid00_blgroup
ren geoid00_back geoid00

// NOTE: pop_all00 is greater that sum(pop_white00 + pop_black00 + pop_asian00 + pop_other00)
//       because the data on race is for 'one race' only (so people with several races
//       are not counted in there.
egen pop_all00 = sum(pop00), by(geoid00_blgroup)
egen blgroup_surface = sum(aland00), by(geoid00_blgroup)


// STEP 3: BREAKING THE DATA DOWN BY POPULATION SHARE

gen pop_share = 0
replace pop_share = pop00/pop_all00 if pop_all00 != 0

gen edu_college00_block = edu_college00*pop_share

drop pop_share

gen year = 2000
keep year geoid00 edu_college00_block

ren geoid00 geoid
ren edu_college00_block edu_college_block

order geoid year edu_college_block

save "temp/edu2000.dta", replace


// END: MATCHING OF NHGIS 2000 BLOCKGROUP DATA
// -----------------------------------------------------------------



clear



// -----------------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 2010 BLOCKGROUP DATA

// STEP 1

insheet using "data/census/nhgis_edu_bgroup/nhgis0011_ds176_20105_2010_blck_grp.csv", delimit(",")

gen test = substr(gisjoin, 2, 100)
gen temp1 = substr(test, 1, 2)
gen temp2 = substr(test, 4, 3)
gen temp3 = substr(test, 8, 6)
gen temp4 = substr(test, 14, 1)
gen geoid10 = temp1 + temp2 + temp3 + temp4


drop test temp*
keep geoid10 jn9e012-jn9e018 jn9e029-jn9e035
gen edu_college10 = jn9e012 + jn9e013 + jn9e014 + jn9e015 + jn9e016 + jn9e017 + jn9e018 + jn9e029 + jn9e030 + jn9e031 + jn9e032 + jn9e033 + jn9e034 + jn9e035

drop jn9e012-jn9e018 jn9e029-jn9e035

//Keep only NYMSA counties
gen ctid = substr(geoid10, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid10 edu_college10
drop ctid temp
save "temp/temp_variables.dta", replace


//STEP 2

//Merge with the 2010 geography files
use "data/geography/geog_new_york/geography2010.dta", clear

gen ctid = substr(geoid10, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid10 geoid10_back
gen geoid10 = substr(geoid10_back, 1, 12)

sort geoid10

merge m:1 geoid10 using "temp/temp_variables.dta"

tab _merge
drop _merge

ren geoid10 geoid10_blgroup
ren geoid10_back geoid10

// NOTE: pop_all00 is greater that sum(pop_white00 + pop_black00 + pop_asian00 + pop_other00)
//       because the data on race is for 'one race' only (so people with several races
//       are not counted in there.
egen pop_all10 = sum(pop10), by(geoid10_blgroup)
egen blgroup_surface = sum(aland10), by(geoid10_blgroup)


// STEP 3: BREAKING THE DATA DOWN BY POPULATION SHARE

gen pop_share = 0
replace pop_share = pop10/pop_all10 if pop_all10 != 0

gen edu_college10_block = edu_college10*pop_share

drop pop_share

gen year = 2010
keep year geoid10 edu_college10_block

ren geoid10 geoid
ren edu_college10_block edu_college_block

order geoid year edu_college_block

save "temp/edu2010.dta", replace


// END: MATCHING OF NHGIS 2010 BLOCKGROUP DATA
// -----------------------------------------------------------------


// Assemble the files

append using "temp/edu2000.dta" "temp/edu1990.dta"

sort year geoid

save "results/NewYork/census_variables_education.dta", replace



// Adjust the geoid codes for 1990
// The 1990 block data comes from the NHGIS, with NHGIS identifiers
// The concordance data comes from the Census with the Census identifiers

// We need to transform the NHGIS identifiers (in the geography1990.dta
// files to Census identifiers)

gen test1 = substr(geoid, 1, 9)
gen test2 = substr(geoid, 10, 3) if strlen(geoid) == 12
gen test3 = test1 + "00" + test2
replace  geoid = test3 if strlen(geoid) == 12

drop test*

gen test1 = substr(geoid, 1, 9)
gen test2 = substr(geoid, 10, 4) if strlen(geoid) == 13
gen test3 = test1 + "00" + test2
replace  geoid = test3 if strlen(geoid) == 13

drop test*

save "results/NewYork/census_variables_education.dta", replace

